# -*- coding: utf-8 -*-
import scrapy
from scrapy.http import Request
import re
from taobao_demo.items import TaobaoDemoItem
import sys
import urllib

class TbbotSpider(scrapy.Spider):
    """Search Taobao for a keyword and scrape title, price and rating count.

    Crawl flow:

    1. ``parse`` issues one search request per result page.
    2. ``page`` extracts every ``"nid"`` from the search response and requests
       the matching item detail page.
    3. ``next`` scrapes title/price from the detail page (Taobao or Tmall
       layout) and requests the rating summary for the item.
    4. ``parse_comment`` / ``comment_failed`` attach the rating data to the
       item and emit it.

    ``keyword`` and ``max_pages`` are class attributes so they can be
    overridden, e.g. ``scrapy crawl tbbot -a keyword=... -a max_pages=3``.
    """

    name = 'tbbot'
    allowed_domains = ['taobao.com']
    start_urls = ['https://taobao.com/']
    num = 0  # not used inside this class; kept in case other code reads it

    keyword = '小提琴'  # search term sent as the ``q`` query parameter
    max_pages = 1  # number of search result pages to request
    # Step of the ``s`` (offset) query parameter between consecutive pages;
    # presumably the number of results per search page -- TODO confirm.
    page_size = 44

    _NID_RE = re.compile(r'"nid":"(.*?)"')
    _HOST_RE = re.compile(r'^https?://([^/:?#]+)')
    # Anchored on ``?``/``&`` so that e.g. ``skuId=`` is not mistaken for
    # ``id=``, and digits-only so trailing parameters are never captured.
    _ITEM_ID_RE = re.compile(r'[?&]id=(\d+)')
    _TMALL_PRICE_RE = re.compile(r'"defaultItemPrice":"(.*?)"')
    _RATE_TOTAL_RE = re.compile(r'"rateTotal":(.*?),"')

    def parse(self, response):
        """Yield one search request per result page for ``self.keyword``."""
        # Spider arguments passed with ``-a`` arrive as strings.
        for page_index in range(int(self.max_pages)):
            offset = self.page_size * page_index
            url = ('https://s.taobao.com/search?q=' + self.keyword
                   + '&s=' + str(offset))
            self.logger.debug('search url: %s', url)
            yield Request(url=url, callback=self.page)

    def page(self, response):
        """Yield a detail-page request for every item id in a search page."""
        body = response.body.decode('utf-8', 'ignore')
        for nid in self._NID_RE.findall(body):
            # All item detail URLs share this prefix.
            url = 'https://item.taobao.com/item.htm?id=' + str(nid)
            yield Request(url=url, callback=self.next)

    def next(self, response):
        """Scrape title and price, then request the item's rating summary.

        Yields a ``Request`` for the rating summary carrying the partially
        filled item in ``meta['item']``. If no item id can be found in the
        response URL, the item is yielded directly with an empty ``comment``.
        """
        item = TaobaoDemoItem()
        # Although the URL is built with item.taobao.com, the request may be
        # redirected to Tmall, so inspect the final URL.
        url = response.url
        host_match = self._HOST_RE.match(url)
        host = host_match.group(1).lower() if host_match else ''

        if host == 'item.taobao.com':
            title = response.xpath(
                "//h3[@class='tb-main-title']/@data-title").extract()
            price = response.xpath(
                "//em[@class = 'tb-rmb-num']/text()").extract()
        else:
            # Any other host is handled with the Tmall page layout.
            title = response.xpath(
                "//div[@class='tb-detail-hd']/h1/text()").extract()
            price = self._TMALL_PRICE_RE.findall(
                response.body.decode('utf-8', 'ignore'))

        item['title'] = title
        item['link'] = url
        item['price'] = price

        id_match = self._ITEM_ID_RE.search(url)
        if id_match is None:
            self.logger.warning('no item id found in url: %s', url)
            item['comment'] = []
            yield item
            return

        # The rating data is not present in the page source (it is loaded
        # only after user interaction), so this endpoint was captured from
        # the browser's developer tools. The original author assumed the
        # same URL works for both Taobao and Tmall items.
        # NOTE: for Taobao items the ``rateTotal`` returned here seems to
        # differ from the number shown on the page;
        # https://rate.taobao.com/detailCommon.htm?auctionNumId=<id> may give
        # a more accurate total.
        comment_url = ('https://dsr-rate.tmall.com/list_dsr_info.htm?itemId='
                       + id_match.group(1))
        # Fetch through Scrapy instead of a blocking urllib call so the
        # reactor is not stalled. ``dont_filter`` is needed because the
        # rating host is outside ``allowed_domains``.
        yield Request(
            url=comment_url,
            callback=self.parse_comment,
            errback=self.comment_failed,
            meta={'item': item},
            dont_filter=True,
        )

    def parse_comment(self, response):
        """Attach every ``rateTotal`` match to the pending item and emit it."""
        item = response.meta['item']
        body = response.body.decode('utf-8', 'ignore')
        item['comment'] = self._RATE_TOTAL_RE.findall(body)
        yield item

    def comment_failed(self, failure):
        """Emit the item without rating data when the rating request fails."""
        item = failure.request.meta['item']
        self.logger.warning('rating request failed: %s', failure.request.url)
        item['comment'] = []
        yield item
